# Read the video game sales CSV from the project's dataset directory and list
# the columns it provides.
vgsales.path <- file.path(project.dir, dataset.dir, "vgsales-12-4-2019.csv")
videogames.df <- read.csv(vgsales.path)
colnames(videogames.df)
## [1] "Rank" "Name" "basename" "Genre"
## [5] "ESRB_Rating" "Platform" "Publisher" "Developer"
## [9] "VGChartz_Score" "Critic_Score" "User_Score" "Total_Shipped"
## [13] "Global_Sales" "NA_Sales" "PAL_Sales" "JP_Sales"
## [17] "Other_Sales" "Year" "Last_Update" "url"
## [21] "status" "Vgchartzscore" "img_url"
# The data was collected in April 2019, so games with Year = 2019 only carry
# partial sales figures; keep games released before 2019. Rows with a missing
# Year are dropped too, because filter() discards rows whose condition is NA.
videogames.clean <- videogames.df %>%
  filter(Year < 2019)
# "E" was originally called "KA" in the ESRB scheme, so fold every "KA" into "E".
videogames.clean <- videogames.clean %>%
  mutate(ESRB_Rating = replace(ESRB_Rating, ESRB_Rating == "KA", "E"))
# Inspect the rating codes that remain before fixing their order.
unique(videogames.clean$ESRB_Rating)
## [1] "E" "" "M" "E10" "T" "RP" "EC" "AO"
# Store the rating as a factor so plots and summaries follow a fixed order:
# unrated (""), rating pending (RP), then the ratings by increasing audience
# age. "EC" (Early Childhood) is placed before "E" (Everyone).
esrb.levels <- c("", "RP", "EC", "E", "E10", "T", "M", "AO")
videogames.clean$ESRB_Rating <- factor(videogames.clean$ESRB_Rating, levels = esrb.levels)
We want to compare sales across different regions, so it would be convenient to have one column ‘region’ and then a corresponding column for sales in USD (millions).
# Reshape the sales columns (Global_Sales through Other_Sales) into long form:
# `Region` holds the original column name and `Sales` its value. Rows with a
# missing sales figure are dropped. Note that "Global_Sales" is kept as one of
# the Region values alongside the regional columns.
vs_byregion <- videogames.clean %>%
  gather(Region, Sales, Global_Sales:Other_Sales, na.rm = TRUE)
Conduct some descriptive analysis on the data, figuring out: * distributions of variables, * variables that appear to be strongly related with each other (using appropriate methods to quantify the relationships based on whether variables are numerical or categorical).
From the boxplot we can see that we have 2 extreme outliers. After investigating, it looks like two outliers are GTA V (ps3 and ps4)
# Box plot of global sales to expose extreme values.
boxplot(videogames.clean$Global_Sales, xlab = "Global Sales (millions of USD)")
# Show the rows whose global sales exceed 17; rows with missing sales are skipped.
subset(videogames.clean, Global_Sales > 17)
# Histogram of global sales with very fine bins, zoomed in on the 0 to 0.5 range.
hist(
  videogames.clean$Global_Sales,
  breaks = 2000,
  xlim = c(0, 0.5),
  xlab = "Global Sales (millions of USD)"
)
# Print the games ordered from highest to lowest global sales.
arrange(videogames.clean, desc(Global_Sales))
# Interactive bar chart: number of games per platform, most frequent first.
platform.counts <- count(videogames.clean, Platform, sort = TRUE)
platform.bars <- ggplot(platform.counts, aes(x = reorder(Platform, -n), y = n)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
ggplotly(platform.bars)
# Number of games per ESRB rating.
ggplot(videogames.clean, aes(x = ESRB_Rating)) +
  geom_bar()
# Number of games per genre, most frequent first.
genre.counts <- count(videogames.clean, Genre, sort = TRUE)
ggplot(genre.counts, aes(x = reorder(Genre, -n), y = n)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Here we looked at distribution of User Scores and Critic Scores as well as the average Critic and User Score over time.
# Overlaid histograms of critic and user scores (bin width 0.5).
# A constant placed inside aes() is treated as a data label, not a colour, so
# each layer maps `fill` to a descriptive label and scale_fill_manual() assigns
# the actual colours. Partial transparency keeps the first layer visible
# underneath the second one.
videogames.clean %>%
  ggplot() +
  geom_histogram(aes(x = Critic_Score, fill = "Critic Score"), binwidth = 0.5, alpha = 0.6) +
  geom_histogram(aes(x = User_Score, fill = "User Score"), binwidth = 0.5, alpha = 0.6) +
  scale_fill_manual(
    name = "Score type",
    values = c("Critic Score" = "pink", "User Score" = "blue")
  ) +
  xlab("Score")
We have a ton of publishers
# Number of games per release year.
videogames.clean %>%
  ggplot(aes(x = Year)) +
  geom_bar()
# Total (SSales) and mean (MSales) sales for every Year/Region pair. Both are
# computed in one summarize() call so the two columns always describe the same
# group, instead of attaching the means by position from a second aggregation.
vs_sales.byregion.byyear <- vs_byregion %>%
  group_by(Year, Region) %>%
  summarize(SSales = sum(Sales), MSales = mean(Sales))
# Solid lines show total sales per region; dotted lines show mean sales
# multiplied by 100 so both series are visible on the same y axis.
vs_sales.byregion.byyear %>%
  ggplot(aes(x = Year)) +
  geom_line(aes(y = SSales, color = Region)) +
  geom_line(aes(y = MSales * 100, color = Region), linetype = "dotted")
# Yearly mean of the three score columns from 1989 onward, drawn as one line
# per score type.
# NOTE(review): an unused `User_Score2` column (user scores blanked before
# 1996) used to be created here but never reached the plot; if that masking is
# wanted, apply it to User_Score before the gather() step.
videogames.clean %>%
  group_by(Year) %>%
  summarise(
    User_Score = mean(User_Score, na.rm = TRUE),
    Critic_Score = mean(Critic_Score, na.rm = TRUE),
    Vgchartzscore = mean(Vgchartzscore, na.rm = TRUE)
  ) %>%
  filter(Year >= 1989) %>%
  # Long format: one row per (Year, ScoreType); years without a score are dropped.
  gather(ScoreType, Score, c(User_Score, Critic_Score, Vgchartzscore), na.rm = TRUE) %>%
  ggplot(aes(x = Year)) + # TODO : Make look better
  geom_line(aes(y = Score, color = ScoreType)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  # A single x scale carries the title, the yearly breaks and the limits; a
  # separate xlim() would replace this scale and discard the breaks.
  scale_x_continuous("Year", breaks = 1989:2018, limits = c(1989, 2018))
# Keep only games with a known global sales figure below 17, which removes the
# extreme values from all later analysis. vs_byregion was built before this
# step and is not rebuilt here.
videogames.clean <- filter(videogames.clean, !is.na(Global_Sales), Global_Sales < 17)
⁃ construct CI for population mean value for sales
Total sample mean and CI
# t-based confidence interval for the mean of Global_Sales over all games.
confidence <- 0.95
alpha <- 1 - confidence
n <- length(videogames.clean$Global_Sales)
mu.hat.all <- mean(videogames.clean$Global_Sales)
sd.hat <- sd(videogames.clean$Global_Sales)
# Standard error of the sample mean.
se <- sd.hat / sqrt(n)
# Point estimate minus/plus the t critical value (n - 1 degrees of freedom)
# times the standard error.
CI <- mu.hat.all + c(-1, 1) * qt(1 - alpha / 2, df = n - 1) * se
mu.hat.all
## [1] 0.3650039
CI
## [1] 0.3535682 0.3764396
One sample inference and CI (2008)
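The test result is not statistically significant (p-value = 0.2212), with a 95% confidence interval of [0.3095337, 0.3778467]. This means that we are 95% confident that the true mean global sales of games released in 2008 lies between about 0.310 and 0.378; because this interval contains the overall mean of 0.365, we fail to reject the null hypothesis that the 2008 mean equals the overall mean.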
# One-sample t-test for games released in 2008. The mean over all games stands
# in for the population mean under the null hypothesis.
mu0 <- mu.hat.all
samp2008 <- videogames.clean %>%
  filter(Year == 2008, !is.na(Global_Sales)) %>%
  pull(Global_Sales)
mu.hat <- mean(samp2008)
t.test(samp2008, mu = mu0)
##
## One Sample t-test
##
## data: samp2008
## t = -1.2239, df = 1671, p-value = 0.2212
## alternative hypothesis: true mean is not equal to 0.3650039
## 95 percent confidence interval:
## 0.3095337 0.3778467
## sample estimates:
## mean of x
## 0.3436902
Check assumptions: * The sample is not randomized (vgchartz’s game database does not include all games and would have a bias towards including games that are available in english) * The population sales distribution is not normal at all (extreme right skew) * The dataset had two extreme outliers identified via boxplot, and removed.
Although the t-distribution CI is robust against non-normal populations, it is highly sensitive to violations of the random sampling assumption. Our dataset is likely missing a disproportionate number of non-Western games and older games, so we likely have an undercoverage issue with the t-distribution method if we consider our population to be all video games that ever existed worldwide. If we instead consider our population to be [TODO: name the narrower population this sample represents], then the CI we have is trustworthy.
We can also use bootstrap to estimate the 95% CI for the mean of video game sales. I would not expect this result to be significantly different, or better than the t-distribution method because bootstrapping is also sensitive to non-random sampling, because the assumption is that our sample is a good representation of the population we are interested in.
Here we conducted a one-sample t-test, with a 95% confidence interval, on sales for the Action genre. We ignored any NA values and excluded extreme outliers such as GTA V. The null hypothesis is that the mean sales of Action video games equals 0.365 (the overall mean), and the alternative hypothesis is that the mean sales of Action video games is not equal to 0.365. The overall mean does not fall inside the 95% confidence interval for Action game sales, so we reject the null hypothesis.
# One-sample t-test of Action games' global sales against 0.365 at the 95%
# confidence level. NA values are ignored.
actions <- filter(videogames.clean, Genre == "Action")
actionconf <- t.test(actions$Global_Sales, mu = 0.365, conf.level = 0.95)
actionconf
##
## One Sample t-test
##
## data: actions$Global_Sales
## t = 2.2127, df = 2891, p-value = 0.02699
## alternative hypothesis: true mean is not equal to 0.365
## 95 percent confidence interval:
## 0.3693669 0.4373343
## sample estimates:
## mean of x
## 0.4033506
# 95% confidence interval for the Action mean: 0.3693669 0.4373343
# Null hypothesis: the mean global sales of Action games equals the overall
# mean (0.365). Alternative: it differs from 0.365.
avgsales <- mean(videogames.clean$Global_Sales, na.rm = TRUE)
avgsales
## [1] 0.3650039
# p-value = 0.02699, and the overall mean (0.365) lies outside the 95%
# confidence interval, so we reject the null hypothesis.
# Cohen's effect size: absolute difference between the Action mean and the
# overall mean, divided by the root mean square of the two standard deviations.
# NOTE(review): the overall group contains the Action games, so the two groups
# overlap; confirm this is the intended standardizer.
abs(mean(actions$Global_Sales) - mean(videogames.clean$Global_Sales)) /
  sqrt((sd(actions$Global_Sales)^2 + sd(videogames.clean$Global_Sales)^2) / 2)
## [1] 0.04389653
#0.04389653
Assumptions for Two-Sided Significance Test for Comparing Two Population Means: 1) A quantitative response variable for two groups - in this case the response is sales, which is quantitative. 2) Independent random samples - our data is not a random sample. 3) Approximately normal population for each group - not true; video game sales have a major right skew, as most video games do not sell very well. The majority of games sell less than 1 million US dollars.
# Subsets for the two genres being compared.
sports <- filter(videogames.clean, Genre == "Sports")
shooter <- filter(videogames.clean, Genre == "Shooter")
# Distribution of global sales within each genre.
ggplot(sports, aes(x = Global_Sales)) +
  geom_histogram()
ggplot(shooter, aes(x = Global_Sales)) +
  geom_histogram()
# Two-sample t-test comparing mean global sales of Sports and Shooter games.
t.test(sports$Global_Sales, shooter$Global_Sales)
##
## Welch Two Sample t-test
##
## data: sports$Global_Sales and shooter$Global_Sales
## t = -5.1181, df = 2008.7, p-value = 3.381e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3097656 -0.1381382
## sample estimates:
## mean of x mean of y
## 0.4707244 0.6946764
# Effect size for the Sports vs Shooter comparison: absolute difference between
# the two genre means, scaled by the standard deviation of global sales across
# all games.
# NOTE(review): the standardizer is the whole-dataset SD rather than a pooled SD
# of the two genres; confirm this is intended.
genre.mean.gap <- abs(
  mean(sports$Global_Sales, na.rm = TRUE) - mean(shooter$Global_Sales, na.rm = TRUE)
)
cohend <- genre.mean.gap / sd(videogames.clean$Global_Sales, na.rm = TRUE)
cohend
## [1] 0.2761827